📦 安裝相依套件(新增)
npm i pdf-parse mammoth
🆕 程式碼
/**
 * Make sure a directory exists, creating it (and any missing parents) when absent.
 *
 * @param {string} dir - Directory path to create if it is not already present.
 * @returns {void}
 */
export function ensureDir(dir) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * Guess a file extension from an HTTP Content-Type header value (simplified).
 *
 * Matching is case-insensitive because MIME type names are case-insensitive,
 * and a null/undefined header falls back to ".bin" instead of throwing.
 *
 * @param {string | null | undefined} [ct=""] - Raw Content-Type value,
 *   e.g. "application/pdf; charset=utf-8".
 * @returns {string} ".pdf", ".doc", ".docx", ".txt" or ".md"; ".bin" when nothing matches.
 */
function extFromContentType(ct = "") {
  const normalized = String(ct ?? "").toLowerCase();
  // Checked in order; the first fragment found in the header wins.
  const rules = [
    ["pdf", ".pdf"],
    ["msword", ".doc"],
    ["officedocument.wordprocessingml.document", ".docx"],
    ["text/plain", ".txt"],
    ["markdown", ".md"],
  ];
  const hit = rules.find(([fragment]) => normalized.includes(fragment));
  return hit ? hit[1] : ".bin";
}
/**
 * Download a remote file into a local directory (uses the global `fetch` of Node 18+).
 *
 * The file is named `dl_<timestamp><ext>`, where the extension is derived from
 * the response's Content-Type header.
 *
 * @param {string} url - Address of the file to download.
 * @param {string} [outDir="outputs/downloads"] - Directory that receives the file; created if missing.
 * @returns {Promise<string>} Path of the file that was written.
 * @throws {Error} When the HTTP response status is not OK.
 */
export async function downloadToTemp(url, outDir = "outputs/downloads") {
  ensureDir(outDir);
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`下載失敗:${res.status} ${res.statusText}`);
  }
  const buf = Buffer.from(await res.arrayBuffer());
  const ct = res.headers.get("content-type") || "";
  const ext = extFromContentType(ct);
  const fp = path.join(outDir, `dl_${Date.now()}${ext}`);
  fs.writeFileSync(fp, buf);
  return fp;
}
/**
 * Read a plain-text file (.txt/.md) and return its contents decoded as UTF-8.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string} The file contents.
 */
function readTxtLike(filePath) {
  const contents = fs.readFileSync(filePath, { encoding: "utf-8" });
  return contents;
}
/**
 * Read a PDF file and return the text extracted by `pdfParse`.
 *
 * @param {string} filePath - Path of the PDF file.
 * @returns {Promise<string>} Extracted text, or "" when the parser reports none.
 */
async function readPdf(filePath) {
  const bytes = fs.readFileSync(filePath);
  const parsed = await pdfParse(bytes);
  if (parsed.text) {
    return parsed.text;
  }
  return "";
}
/**
 * Read a DOCX file and return its raw text via `mammoth.extractRawText`.
 *
 * @param {string} filePath - Path of the DOCX file.
 * @returns {Promise<string>} Extracted text, or "" when mammoth reports none.
 */
async function readDocx(filePath) {
  const bytes = fs.readFileSync(filePath);
  const extraction = await mammoth.extractRawText({ buffer: bytes });
  const rawText = extraction.value;
  return rawText ? rawText : "";
}
/**
 * Read a document's text, picking the reader from the file extension
 * (compared case-insensitively).
 *
 * @param {string} filePath - Path of a .txt, .md, .pdf or .docx file.
 * @returns {Promise<string>} The text content of the file.
 * @throws {Error} When the extension is not one of the supported formats.
 */
export async function readTextFile(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  if (ext === ".txt" || ext === ".md") return readTxtLike(filePath);
  if (ext === ".pdf") return await readPdf(filePath);
  if (ext === ".docx") return await readDocx(filePath);
  throw new Error(`不支援的檔案格式:${ext}`);
}
/**
 * Write a value to disk as pretty-printed JSON (2-space indent, UTF-8).
 *
 * @param {string} filePath - Destination file path.
 * @param {unknown} obj - Value to serialize with `JSON.stringify`.
 * @returns {void}
 */
export function writeJson(filePath, obj) {
  const serialized = JSON.stringify(obj, null, 2);
  fs.writeFileSync(filePath, serialized, { encoding: "utf-8" });
}
/**
 * Write a string to disk as UTF-8 (used here for the Markdown output).
 *
 * @param {string} filePath - Destination file path.
 * @param {string} text - Content to write.
 * @returns {void}
 */
export function writeText(filePath, text) {
  const writeOptions = { encoding: "utf-8" };
  fs.writeFileSync(filePath, text, writeOptions);
}
/**
 * Split text into overlapping chunks, using character count as a rough
 * stand-in for tokens (about 1200 characters per chunk by default).
 *
 * Consecutive chunks share `overlap` characters so a sentence cut at a chunk
 * boundary still appears whole in one of them.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=1200] - Maximum characters per chunk; positive integer.
 * @param {number} [overlap=120] - Characters shared between neighbours; integer in [0, chunkSize).
 * @returns {string[]} The chunks in order; an empty array for empty text.
 * @throws {RangeError} When `chunkSize` or `overlap` would prevent forward progress.
 */
function chunkText(text, chunkSize = 1200, overlap = 120) {
  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
    throw new RangeError(`chunkSize 必須為正整數(收到 ${chunkSize})`);
  }
  if (!Number.isInteger(overlap) || overlap < 0 || overlap >= chunkSize) {
    throw new RangeError(`overlap 必須為小於 chunkSize 的非負整數(收到 ${overlap})`);
  }
  const chunks = [];
  const step = chunkSize - overlap;
  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(text.length, start + chunkSize);
    chunks.push(text.slice(start, end));
    // Stop once the end of the text is reached; stepping back by `overlap`
    // here would re-emit the final chunk forever.
    if (end === text.length) break;
  }
  return chunks;
}
/**
 * Summarize a single chunk (the "map" stage) with one chat-completion request.
 *
 * @param {string} chunk - Text of the chunk to summarize.
 * @param {{ tone?: string, length?: string }} [opts] - Prompt options;
 *   `tone` defaults to "professional" and `length` to "medium".
 * @returns {Promise<string>} The trimmed reply text, or "" when the response has no content.
 */
async function summarizeChunk(chunk, opts) {
  const { tone = "professional", length = "medium" } = opts ?? {};
  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.3,
    messages: [
      {
        role: "system",
        content:
          "你是嚴謹的中文技術編輯。請以重點清單 + 2~3 句摘要回覆,不可虛構內容。",
      },
      {
        role: "user",
        content: `請摘要以下內容。語氣:${tone}。長度:${length}。\n\n${chunk}`,
      },
    ],
  });
  return res.choices?.[0]?.message?.content?.trim() || "";
}
/**
 * "Reduce" stage: merge the per-chunk summaries into one high-level summary
 * object by asking the model for a JSON reply.
 *
 * @param {string[]} summaries - Per-chunk summaries, in document order.
 * @param {{ tone?: string, length?: string }} [opts] - Prompt options;
 *   `tone` defaults to "professional" and `length` to "medium".
 * @returns {Promise<object>} The parsed JSON reply. The prompt asks for the keys
 *   `tldr`, `outline`, `keyPoints`, `actionItems` and `questions`, but the shape
 *   is whatever the model returned.
 * @throws {SyntaxError} When the reply is not valid JSON.
 */
async function reduceSummaries(summaries, opts) {
  const { tone = "professional", length = "medium" } = opts ?? {};
  const joined = summaries.map((s, i) => `# 小節${i + 1}\n${s}`).join("\n\n");
  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.3,
    messages: [
      {
        role: "system",
        content: `你是嚴謹的中文總編輯。整合所有小節摘要,產出:\n1) TL;DR(3~5 句)\n2) Outline(6~12 條)\n3) KeyPoints(5~10 條)\n4) ActionItems(可執行清單,若無則空陣列)\n5) Questions(讀者可能想追問的 3~6 題)\n請以純 JSON 回覆,格式:{"tldr":"...","outline":[...],"keyPoints":[...],"actionItems":[...],"questions":[...]}\n內容不得虛構。`,
      },
      {
        role: "user",
        content: `語氣:${tone},長度:${length}。\n以下為各小節摘要,請彙整:\n\n${joined}`,
      },
    ],
  });
  const raw = res.choices?.[0]?.message?.content?.trim() || "{}";
  // The model may wrap its JSON in a Markdown code fence; unwrap it when present.
  const json = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1] ?? raw;
  return JSON.parse(json);
}
/**
 * Ask the model to pick verbatim, quotable sentences from the original text
 * (quoting rather than rewriting, to avoid fabricated content).
 *
 * Only the first 16000 characters of `original` are sent. This is best-effort:
 * a reply that is not a JSON array yields an empty list instead of an error.
 *
 * @param {string} original - Full source text.
 * @param {number} [limit=5] - Maximum number of quotes to request and return.
 * @returns {Promise<Array>} Up to `limit` entries from the model's JSON array; [] on any parse problem.
 */
async function extractQuotes(original, limit = 5) {
  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.2,
    messages: [
      {
        role: "system",
        content: `從原文中挑選最多 ${limit} 句適合直接引用的『原句』,不可改寫;若沒有合適句子可少於 ${limit}。`,
      },
      {
        role: "user",
        content: `請由以下原文挑選金句(以 JSON 陣列回覆):\n${original.slice(0, 16000)}`,
      },
    ],
  });
  const raw = res.choices?.[0]?.message?.content?.trim() || "[]";
  // The model may wrap its JSON in a Markdown code fence; unwrap it when present.
  const json = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1] ?? raw;
  try {
    const parsed = JSON.parse(json);
    // Callers map over the result, so anything other than an array is treated as "no quotes".
    return Array.isArray(parsed) ? parsed.slice(0, limit) : [];
  } catch {
    return [];
  }
}
/**
 * Turn a list into Markdown lines, or a single placeholder line when the list
 * is missing or empty.
 */
function listLinesOrFallback(items, formatItem, fallbackLine) {
  if (!Array.isArray(items) || items.length === 0) return [fallbackLine];
  return items.map(formatItem);
}

/** Render the summary result object as the Markdown report text. */
function buildSummaryMarkdown(result) {
  return [
    `# ${result.title}`,
    ``,
    `- 產出時間:${result.createdAt}`,
    `- 字數:約 ${result.wordCount}`,
    `- 分塊數:${result.chunks}`,
    ``,
    `## TL;DR`,
    ``,
    result.tldr || "(無)",
    ``,
    `## Outline`,
    ...listLinesOrFallback(result.outline, (o, i) => `${i + 1}. ${o}`, "(無)"),
    ``,
    `## Key Points`,
    ``,
    ...listLinesOrFallback(result.keyPoints, (o) => `- ${o}`, "(無)"),
    ``,
    `## Action Items`,
    ...listLinesOrFallback(result.actionItems, (o) => `- [ ] ${o}`, "- (無)"),
    ``,
    `## Questions`,
    ``,
    ...listLinesOrFallback(result.questions, (o) => `- ${o}`, "- (無)"),
    ``,
    `## 引用金句`,
    ...listLinesOrFallback(result.quotes, (q) => `> ${q}`, "> (無)"),
    ``,
  ].join("\n");
}

/**
 * Summarize a document end to end: obtain a local file (downloading `url` when
 * no `filePath` is given), read its text, summarize chunk by chunk, merge the
 * chunk summaries, pick quotes from the original text, and write the result to
 * `outputs/docs/summary_<timestamp>.json` and `.md`.
 *
 * NOTE(review): the original header of this function was missing from the
 * file; the name, the options object and the `export` are reconstructed from
 * the call in `main()` — confirm against other callers.
 *
 * @param {object} options
 * @param {string | null} [options.filePath] - Local document path (.txt/.md/.pdf/.docx).
 * @param {string | null} [options.url] - Remote document to download when `filePath` is absent.
 * @param {string} [options.length="medium"] - Summary length hint passed to the prompts.
 * @param {string} [options.tone="professional"] - Tone hint passed to the prompts.
 * @param {number} [options.chunkSize=1200] - Characters per chunk.
 * @param {number} [options.overlap=120] - Characters shared between neighbouring chunks.
 * @returns {Promise<{ jsonPath: string, mdPath: string, meta: object }>} Output paths and the result object.
 * @throws {Error} When neither `filePath` nor `url` is given, or the document has no readable text.
 */
export async function summarizeDocument({
  filePath,
  url,
  length = "medium",
  tone = "professional",
  chunkSize = 1200,
  overlap = 120,
} = {}) {
  // 1) Resolve a local file path.
  if (!filePath && !url) throw new Error("請提供 filePath 或 url 其一。");
  const localPath = filePath || (await downloadToTemp(url));

  // 2) Read the document text.
  const fullText = (await readTextFile(localPath)).trim();
  if (!fullText) throw new Error("文件內容為空或無法解析。");

  // 3) Chunk and map. Requests run one at a time, as in the original flow.
  const chunks = chunkText(fullText, chunkSize, overlap);
  const perChunkSummaries = [];
  for (const c of chunks) {
    const s = await summarizeChunk(c, { tone, length });
    perChunkSummaries.push(s);
  }

  // 4) Reduce: merge the chunk summaries.
  const merged = await reduceSummaries(perChunkSummaries, { tone, length });

  // 5) Quotes, taken directly from the original text.
  const quotes = await extractQuotes(fullText, 5);

  // 6) Assemble the output.
  const result = {
    title: path.basename(localPath),
    wordCount: fullText.length,
    chunks: chunks.length,
    tldr: merged.tldr || "",
    outline: merged.outline || [],
    keyPoints: merged.keyPoints || [],
    actionItems: merged.actionItems || [],
    questions: merged.questions || [],
    quotes,
    createdAt: new Date().toISOString(),
  };

  // 7) Write the files.
  const outDir = path.join("outputs", "docs");
  ensureDir(outDir);
  const stamp = Date.now();
  const jsonPath = path.join(outDir, `summary_${stamp}.json`);
  const mdPath = path.join(outDir, `summary_${stamp}.md`);
  writeJson(jsonPath, result);
  writeText(mdPath, buildSummaryMarkdown(result));
  return { jsonPath, mdPath, meta: result };
}
// ...earlier code omitted (existing args parsing and the other tasks)

/**
 * Parse an optional numeric CLI argument as an integer count.
 * Missing values (null/undefined/"") use the fallback; anything else must be
 * an integer no smaller than `min`.
 */
function parseCountArg(value, fallback, label, min) {
  if (value == null || value === "") return fallback;
  const parsed = Number(value);
  if (!Number.isInteger(parsed) || parsed < min) {
    throw new Error(`${label} 必須為不小於 ${min} 的整數(收到 ${value})`);
  }
  return parsed;
}

/**
 * CLI entry point: dispatch on `args.task` (default "chat"). For "docsum" it
 * summarizes the document given by `args.filePath` or `args.url` and prints
 * the output paths and the TL;DR.
 *
 * @returns {Promise<void>}
 * @throws {Error} When `chunkSize`/`overlap` are not usable integers, or when summarizing fails.
 */
async function main() {
  const task = args.task || "chat";
  if (task === "docsum") {
    const filePath = args.filePath || null;
    const url = args.url || null;
    const length = args.length || "medium"; // short | medium | long
    const tone = args.tone || "professional"; // friendly | professional
    // Validated here so that NaN never reaches the chunker and an explicit
    // overlap of 0 is honoured rather than replaced by the default.
    const chunkSize = parseCountArg(args.chunkSize, 1200, "chunkSize", 1);
    const overlap = parseCountArg(args.overlap, 120, "overlap", 0);
    if (overlap >= chunkSize) {
      throw new Error(`overlap(${overlap})必須小於 chunkSize(${chunkSize})`);
    }
    const out = await summarizeDocument({ filePath, url, length, tone, chunkSize, overlap });
    console.log("\n=== 文件摘要完成 ===");
    console.log("- JSON:", out.jsonPath);
    console.log("- Markdown:", out.mdPath);
    console.log("\nTL;DR:\n", out.meta.tldr);
    // ...the remaining task branches stay unchanged
  } else {
    // existing else branch omitted
  }
}
// Run the CLI; on any failure print the error message and exit with status 1.
function reportFatalError(err) {
  console.error("發生錯誤:", err.message);
  process.exit(1);
}

main().catch(reportFatalError);
▶️ 如何執行(CLI)
npm run day11:pdf --silent
npm run day11:txt --silent
npm run day11:url --silent
完成後你會看到:
outputs/docs/summary_17265xxxxx.json
outputs/docs/summary_17265xxxxx.md